kSanity-VIRGO metagenomics counts and rel. abundances QC

Author

Laura Symul, Laura Vermeren, Michael France

Published

October 20, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(labelled)
library(vegan)
library(RColorBrewer)

tmp <- fs::dir_map("R scripts/", source)
tmp <- fs::dir_map("../VIBRANT-99-utils/R/", source)
rm(tmp)

theme_set(theme_light())

Loading the data

Code
data_dir <- get_data_dir()
mg_dir <- str_c(data_dir, "02 Metagenomics/")

counts <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250611.csv"))
counts_corr <- read_csv(str_c(mg_dir, "MVIBR_kSanityVirgo2_GLcorr_20250611.csv"))
relabs <- read_csv(str_c(mg_dir, "MVIBR_kSanityVirgo2_relAbund_20250611.csv"))
technical_metadata <- read_csv(str_c(mg_dir, "VIBRANT_MG_technicalMetaData_20250611.csv"), guess_max = Inf)
VIRGO2_taxonomic_table <- read_csv(str_c(mg_dir, "VIRGO2_taxonomy_key_250429.csv"))

LBP_strain_info <- readxl::read_xlsx(str_c(data_dir, "00 Trial Data/IsolateNumbers.xlsx"))

We load the following tables, all provided by Michael France:

  • technical_metadata technical metadata (dimensions: 1190 samples x 26 columns)

  • counts raw counts per LBP strain and taxa (dimensions: 999 samples x 783 columns)

  • counts_corr counts per LBP strain and taxa corrected by gene length (dimensions: 999 samples x 783 columns)

  • relabs relative abundances per LBP strain and taxa, computed by M. France from counts_corr (dimensions: 999 samples x 783 columns)

  • VIRGO2_taxonomic_table taxonomic table for the taxa as defined in VIRGO2 (dimensions: 779 taxa x 10 columns)

Manifest QC

In this section, we check, harmonize, and transform the manifest and technical metadata provided by Michael France.

Code
technical_metadata_original <- technical_metadata

Column name cleaning and description

The technical_metadata table contains the following columns:

Code
technical_metadata |> glimpse()
Rows: 1,190
Columns: 26
$ UID                           <chr> "MG_202", "MG_253", "MG_298", "MG_331", …
$ PoorQualityMGData             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ NoMGData                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ SampleNumber                  <chr> "202", "253", "298", "331", "334", "360"…
$ PID                           <chr> "068-10-0004", "068-10-0004", "068-10-00…
$ VisitCode                     <chr> "0", "1000", "1100", "1200", "0", "1000"…
$ VisitCodeFlagged              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ SampleType                    <chr> "ClinicalSample", "ClinicalSample", "Cli…
$ Ext_Lib_Plate                 <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9…
$ Ext_Lib_Plate_ID              <dbl> 2316392, 2316392, 2316392, 2316392, 2316…
$ Ext_Lib_Position              <chr> "A1", "B1", "C1", "D1", "E1", "F1", "G1"…
$ SequencingRun                 <chr> "250213_A00904_0436_BH2HCCDSXF", "250213…
$ DateSequenced                 <dbl> 250213, 250213, 250213, 250213, 250213, …
$ Lane                          <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ Sample                        <chr> "NA19356-001", "NA19357-001", "NA19358-0…
$ `Library Pool`                <chr> "IP709", "IP709", "IP709", "IP711", "IP7…
$ Library                       <chr> "IL19356-001", "IL19357-001", "IL19358-0…
$ FragmentSize                  <dbl> 388, 384, 425, 384, 386, 405, 379, 379, …
$ IndexSet                      <chr> "N3UD-A01", "N3UD-B01", "N3UD-C01", "N3U…
$ `Index 1`                     <chr> "TGTCGTAG", "CAATCATA", "GTTCTTAT", "GAT…
$ `Index 2`                     <chr> "AAAGCTAA", "TGGAGATT", "AATTAGAC", "ACT…
$ BioinformaticsProcessingBatch <dbl> 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4…
$ `Selected4re-extraction`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `Re-extracted_data_used`      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LibrarySequencedTwice         <dbl> NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, N…
$ Notes                         <chr> NA, NA, NA, NA, NA, NA, "requeue for add…
Code
# checks
all(technical_metadata$UID == str_c("MG_", technical_metadata$SampleNumber), na.rm = TRUE)
technical_metadata |> dplyr::filter(UID != str_c("MG_", SampleNumber))
Code
dictionary <- 
  tibble(original_name = colnames(technical_metadata)) |> 
  mutate(
    description = 
      case_when(
        (original_name == "UID") ~ '"MG_" followed by the `SampleNumber`',
        (original_name == "PoorQualityMGData") ~ "Whether the sample has poor quality metagenomics data.", 
        (original_name == "NoMGData") ~ "Whether the sample has no metagenomics data.",
        (original_name == "SampleNumber") ~ 'Vaginal swab barcode for metagenomics and qPCR',
        (original_name == "PID") ~ 'Participant ID as provided in the metagenomics manifest',
        (original_name == "VisitCode") ~ 'Visit code (double)',
        (original_name == "VisitCodeFlagged") ~ '1 if Michael flagged the visit code as unexpected',
        (original_name == "SampleType") ~ 'Sample type (e.g., Clinical sample, Control, etc.)',
        (original_name == "Ext_Lib_Plate") ~ 'Extraction library plate number',
        (original_name == "Ext_Lib_Plate_ID") ~ 'Extraction library plate ID',
        (original_name == "Ext_Lib_Position") ~ 'Well position of sample on the extraction library plate',
        (original_name == "SequencingRun") ~ 'Sequencing run ID',
        (original_name == "DateSequenced") ~ 'Sequencing date',
        (original_name == "Lane") ~ 'Sequencing lane',
        (original_name == "Sample") ~ 'Sequencing sample ID',
        (original_name == "Library Pool") ~ 'Library pool',
        (original_name == "Library") ~ 'Pooled library ID',
        (original_name == "FragmentSize") ~ 'Size of fragment',
        (original_name == "IndexSet") ~ 'Index set ID',
        (original_name == "Index 1") ~ 'Index 1',
        (original_name == "Index 2") ~ 'Index 2',
        (original_name == "BioinformaticsProcessingBatch") ~ 'Bioinformatics processing batch number',
        (original_name == "Selected4re-extraction") ~ 'Whether the sample had been selected for re-extraction by Michael (poor signal samples and a few random good signal samples).',
        (original_name == "Re-extracted_data_used") ~ 'Whether the re-extracted data was used for the sample.',
        (original_name == "LibrarySequencedTwice") ~ 'Whether a sample has been sequenced twice (to increase coverage).',
        (original_name == "Notes") ~ 'Notes from Michael France',
        TRUE ~ "????"
      ),
    print_label = 
      case_when(
        (original_name == "UID") ~ 'Metagenomics unique sample identifier',
        (original_name == "PoorQualityMGData") ~ "Poor quality MG data", 
        (original_name == "NoMGData") ~ "No MG data",
        (original_name == "SampleNumber") ~ 'Vaginal swab barcode',
        (original_name == "PID") ~ 'Participant ID',
        (original_name == "VisitCode") ~ 'Visit code',
        (original_name == "VisitCodeFlagged") ~ 'Unexpected visit code',
        (original_name == "SampleType") ~ 'Sample type',
        (original_name == "Ext_Lib_Plate") ~ 'Extraction library plate nb',
        (original_name == "Ext_Lib_Plate_ID") ~ 'Extraction library plate ID',
        (original_name == "Ext_Lib_Position") ~ 'Well position of sample on the extraction library plate',
        (original_name == "SequencingRun") ~ 'Sequencing run ID',
        (original_name == "DateSequenced") ~ 'Sequencing date',
        (original_name == "Lane") ~ 'Sequencing lane',
        (original_name == "Sample") ~ 'Sequencing sample ID',
        (original_name == "Library Pool") ~ 'Library pool',
        (original_name == "Library") ~ 'Pooled library ID',
        (original_name == "FragmentSize") ~ 'Size of fragment',
        (original_name == "IndexSet") ~ 'Index set ID',
        (original_name == "Index 1") ~ 'Index 1',
        (original_name == "Index 2") ~ 'Index 2',
        (original_name == "BioinformaticsProcessingBatch") ~ 'Bioinformatics processing batch nb',
        (original_name == "Selected4re-extraction") ~ 'Selected for re-extraction',
        (original_name == "Re-extracted_data_used") ~ 'Re-extracted data was used',
        (original_name == "LibrarySequencedTwice") ~ 'Sequenced twice',
        (original_name == "Notes") ~ 'Notes',
        TRUE ~ "????"
      )
  )

We rename and harmonize the column names of the technical metadata.

Code
technical_metadata <- 
  technical_metadata |> 
  janitor::clean_names() |>
  dplyr::rename(
    mg_uid = uid,
    mg_pid = pid,
    mg_visit_code = visit_code,
    mg_sample_type = sample_type,
    swab_barcode = sample_number,
    ext_lib_plate_nb = ext_lib_plate,
    sequencing_sample_id = sample,
    sequencing_lane = lane,
    sequencing_date = date_sequenced,
    selected_for_re_extraction = selected4re_extraction
  ) 
Code
dictionary <- 
  dictionary |>  
  mutate(name = colnames(technical_metadata)) |> 
  select(name, original_name, description, print_label)
Code
dictionary |> 
  select(-print_label) |> 
  gt() |> 
  cols_label(
    name = "New column name",
    original_name = "Original column name",
    description = "Description"
  ) 
New column name Original column name Description
mg_uid UID "MG_" followed by the `SampleNumber`
poor_quality_mg_data PoorQualityMGData Whether the sample has poor quality metagenomics data.
no_mg_data NoMGData Whether the sample has no metagenomics data.
swab_barcode SampleNumber Vaginal swab barcode for metagenomics and qPCR
mg_pid PID Participant ID as provided in the metagenomics manifest
mg_visit_code VisitCode Visit code (double)
visit_code_flagged VisitCodeFlagged 1 if Michael flagged the visit code as unexpected
mg_sample_type SampleType Sample type (e.g., Clinical sample, Control, etc.)
ext_lib_plate_nb Ext_Lib_Plate Extraction library plate number
ext_lib_plate_id Ext_Lib_Plate_ID Extraction library plate ID
ext_lib_position Ext_Lib_Position Well position of sample on the extraction library plate
sequencing_run SequencingRun Sequencing run ID
sequencing_date DateSequenced Sequencing date
sequencing_lane Lane Sequencing lane
sequencing_sample_id Sample Sequencing sample ID
library_pool Library Pool Library pool
library Library Pooled library ID
fragment_size FragmentSize Size of fragment
index_set IndexSet Index set ID
index_1 Index 1 Index 1
index_2 Index 2 Index 2
bioinformatics_processing_batch BioinformaticsProcessingBatch Bioinformatics processing batch number
selected_for_re_extraction Selected4re-extraction Whether the sample had been selected for re-extraction by Michael (poor signal samples and a few random good signal samples).
re_extracted_data_used Re-extracted_data_used Whether the re-extracted data was used for the sample.
library_sequenced_twice LibrarySequencedTwice Whether a sample has been sequenced twice (to increase coverage).
notes Notes Notes from Michael France

We also make some basic column type transformations (to logical, characters, numeric, etc.)

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    poor_quality_mg_data = (poor_quality_mg_data == 1),
    no_mg_data = (no_mg_data == 1),
    visit_code_flagged = (visit_code_flagged == 1),
    sequencing_date = lubridate::as_date(sequencing_date |> as.character(), format = "%y%m%d"),
    selected_for_re_extraction = (selected_for_re_extraction == 1) |> replace_na(FALSE),
    re_extracted_data_used = (re_extracted_data_used == 1) |> replace_na(FALSE),
    library_sequenced_twice = (library_sequenced_twice == 1) |> replace_na(FALSE),
  )

FIXES to visit codes based on feedback from the CAPRISA team

Based on data received from Nireshni Mitchev on May 27th 2025, we fix the visit codes for one sample.

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    mg_visit_code = 
      case_when(
        (swab_barcode == "2271051") ~ "10",
        # (swab_barcode == "2280392") ~ 10,
        TRUE ~ mg_visit_code
      )
  )

Sample and control type harmonization

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    sample_type = mg_sample_type |> str_replace("lS", "l s") |> str_replace("b\\+C", "b \\+ C"),
    control_type = 
      case_when(
        sample_type %in% c("Clinical sample", "Unknown") ~ "",
        TRUE ~ sample_type
      ) |> factor(),
    sample_type = 
      case_when(
        str_detect(sample_type, "Unknown") | str_detect(mg_pid, "^068.*[xX]$") ~ "Test sample",
        str_detect(sample_type, "Clinical") ~ "Clinical sample",
        str_detect(sample_type, "Mock") ~ "Positive control",
        TRUE ~ "Negative control"
      ) |> 
      fct_infreq()
  ) |> 
  relocate(sample_type, control_type, .after = mg_sample_type) 

technical_metadata |> dplyr::count(sample_type, mg_sample_type, control_type) |> gt()
sample_type mg_sample_type control_type n
Clinical sample ClinicalSample 1137
Positive control Mock 1 Mock 1 12
Positive control Mock 2 Mock 2 11
Negative control Nuclease-free water Nuclease-free water 11
Negative control Unused swab+C2 Unused swab + C2 11
Test sample ClinicalSample 4
Test sample Unknown 4
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = c("sample_type", "control_type"),
      description = c("Sample type (harmonized)", "Control type (Mock 1, Mock 2, etc.)"),
      print_label = c("Sample type", "Control type")
    )
  ) |> 
  distinct()

Participant ID harmonization

In the technical metadata, the participant ID is provided in three different formats.

We harmonize them to match the pattern 068[12]0[0-9]{4} (e.g., “068101234”).

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
      pid = 
        case_when(
          str_detect(mg_pid, "068-") ~ mg_pid |> str_remove_all("-"),
          str_detect(mg_pid, "^68") ~ str_c("0", mg_pid),
          TRUE ~ mg_pid
        )
    ) |> 
    relocate(pid, .before = mg_pid)
Code
technical_metadata |> 
  mutate(pid_length = str_length(pid)) |> 
  dplyr::filter(pid_length != 9) |> 
  View()

There are a few samples from a participant from a different study (“CAP098”):

Code
technical_metadata |> 
  mutate(pid_length = str_length(pid)) |> 
  dplyr::filter(pid_length != 9, sample_type == "Clinical sample") |> 
  select(mg_uid, swab_barcode, pid, mg_pid, mg_visit_code, ext_lib_plate_nb) |> 
  gt()
mg_uid swab_barcode pid mg_pid mg_visit_code ext_lib_plate_nb
MG_2285783 2285783 98120009 98120009 1010 5
MG_2285793 2285793 98120009 98120009 1020 5
MG_2285794 2285794 98120009 98120009 1030 5
MG_2285800 2285800 98120009 98120009 1040 5
MG_2285806 2285806 98120009 98120009 1050 5
MG_2285808 2285808 98120009 98120009 1060 5
MG_2285809 2285809 98120009 98120009 1070 5

We modify the sample_type of these samples to “Clinical sample (other study)”.

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    sample_type = 
      ifelse(
        str_detect(pid, "^98"), 
        "Clinical sample (other study)", 
        sample_type |> as.character()
        ) |> factor()
    ) 

Visit codes are transformed to the desired format (i.e., a 4 digit character string):

Code
technical_metadata <-
  technical_metadata |> 
  mutate(
    visit_code = 
      case_when(
        mg_visit_code == "V5" ~ "1400",
        mg_visit_code == "V6" ~ "1500",
        TRUE ~ mg_visit_code |> as.character() |> str_pad(width = 4, pad = "0")
      )
    ) |> 
  relocate(visit_code, .before = mg_visit_code)
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = c("pid", "visit_code"),
      description = c("Harmonized participant ID", "Harmonized visit code"),
      print_label = c("Participant ID", "Visit code")
    )
  ) |> 
  distinct()

Samples sequenced twice

Code
samples_sequenced_twice <- 
  technical_metadata |> 
  group_by(mg_uid) |> 
  summarize(n_rows = n(), library_sequenced_twice = sum(library_sequenced_twice, na.rm = TRUE)) |> 
  arrange(-n_rows)

# samples_sequenced_twice |> dplyr::count(n_rows, library_sequenced_twice)

samples_sequenced_twice <- 
  samples_sequenced_twice |> 
  dplyr::filter(n_rows > 1) 

# sum(technical_metadata$library_sequenced_twice, na.rm = TRUE) == nrow(samples_sequenced_twice)

161 samples have been sequenced twice to increase the coverage. The technical metadata contains a column library_sequenced_twice (prev. LibrarySequencedTwice) that indicates whether a sample has been sequenced twice. Since the data is merge after sequencing and before bioinformatics processing (VIRGO2 and kSANITY), we need to merge the technical metadata for these samples so they match the counts, counts_corr and relabs data.

Code
samples_sequenced_twice |> 
  gt(caption = "Samples that have been sequenced twice") 
Code
rm(samples_sequenced_twice)

Since metagenomic reads from re-sequenced samples have either been merged or discarded, we aggregate the technical metadata such that we only have one row per sample (mg_uid or uid).

Code
# we first identify columns that may contain different information for re-sequenced genes
# (we expect sequencing-related columns)

cols_to_aggregate <-
  technical_metadata |> 
  group_by(mg_uid) |> 
  arrange(sequencing_date) |> 
  summarise(
    across(everything(), ~ unique(.) |> length())
  ) |> 
  select(-mg_uid) |>
  summarise(across(everything(), ~ sum(. > 1))) |> 
  pivot_longer(cols = everything(), names_to = "column", values_to = "needs_aggregation") |> 
  dplyr::filter(needs_aggregation > 0) |> 
  pull(column)

# We aggregate these columns

technical_metadata_agg <- 
  technical_metadata |> 
  select(mg_uid, all_of(cols_to_aggregate)) |> 
  group_by(mg_uid) |> 
  arrange(sequencing_date) |> 
  summarise(
    latest_sequencing_date = max(sequencing_date),
    across(all_of(cols_to_aggregate), ~ unique(.) |> na.omit() |> str_c(collapse = "; "))
  ) |> 
  mutate(library_sequenced_twice = str_detect(library_sequenced_twice, "TRUE")) 

# Then merge with the unique values for the other columns

technical_metadata_agg <- 
  technical_metadata |> 
  select(-all_of(cols_to_aggregate)) |>
  distinct() |> 
  dplyr::left_join(technical_metadata_agg, by = join_by(mg_uid))

Matching the sequenced samples with the expected list of samples (CRF35)

We now match the technical metadata with the expected list of samples from CRF35 (Specimen collection).

Code
crf_files <- 
  get_01_output_dir() |> 
  fs::dir_ls() |> 
  str_subset("/01_")

visits_file <- 
  crf_files |> 
  str_subset("01_visits_2025") |> 
  sort(decreasing = TRUE) |> 
  extract(1)

load(visits_file, verbose = TRUE)
Loading objects:
  visits
Code
participants_file <- 
  crf_files |> 
  str_subset("01_participants_2025") |> 
  sort(decreasing = TRUE) |> 
  extract(1)

load(participants_file, verbose = TRUE)
Loading objects:
  participants
Code
crf_file <- 
  crf_files |> 
  str_subset("01_crf_clean_") |> 
  sort(decreasing = TRUE) |>
  extract(1)

load(crf_file, verbose = TRUE)
Loading objects:
  crf_clean
Code
matched <- 
  dplyr::full_join(
    visits |> 
      select(pid, visit_code, specimen_collection_swab) |> 
      mutate(in_CRF = TRUE),
    technical_metadata_agg |> 
      select(pid, visit_code, sample_type, control_type, swab_barcode, mg_pid, no_mg_data) |> 
      mutate(
        in_mg_manifest = TRUE,
        has_mg_data = (no_mg_data != 1)
        ),
    by = join_by(pid, visit_code)
  ) |> 
  dplyr::left_join(
    participants |> 
      select(pid, site, randomized) |> 
      distinct() |> 
      mutate(pid_in_CRF = TRUE),
    by = join_by(pid)
  ) |> 
  mutate(
    randomized = randomized |> replace_na(FALSE),
    in_CRF = in_CRF |> replace_na(FALSE),
    in_mg_manifest = in_mg_manifest |> replace_na(FALSE),
    has_mg_data = has_mg_data |> replace_na(FALSE) ,
    pid_in_CRF = pid_in_CRF |> replace_na(FALSE),
    specimen_collection_swab = specimen_collection_swab |> replace_na(0)
  )
Code
matched <- 
  matched |> 
  group_by(pid, visit_code) |> 
  mutate(n_mg_rows = n()) |>
  ungroup() |> 
  mutate(
    category = 
      case_when(
        sample_type == "Clinical sample (other study)" ~ "Clinical sample from another study",
        sample_type != "Clinical sample" ~ "Not a clinical sample",
        has_mg_data & !pid_in_CRF ~ "Unexpected participant ID",
        has_mg_data & !randomized ~ "Sequenced sample from non-randomized participants",
        has_mg_data & !in_CRF ~ "Sequenced sample not listed in CRF35",
        !in_mg_manifest & in_CRF & (specimen_collection_swab > 0) & randomized ~ "Sample from a randomized participant not in mg manifest",
        !has_mg_data & in_CRF & (specimen_collection_swab > 0) & randomized ~ "Sample from a randomized participant not sequenced",
        !in_mg_manifest & in_CRF & (specimen_collection_swab > 0) & (!randomized) ~ "Sample from a non-randomized participant not in mg manifest",
        !has_mg_data & in_CRF & (specimen_collection_swab > 0) & (!randomized) ~ "Sample from a non-randomized participant not sequenced",
        !has_mg_data & in_CRF & (specimen_collection_swab == 0) ~ "No swab collected according to CRF35 (and no MG data)",
        has_mg_data & in_CRF & (specimen_collection_swab == 0) ~ "Unexpected sequencing data (no swab collected according to CRF35)",
        has_mg_data & in_CRF ~ "Expected sample",
        TRUE ~ "???"
      )
  ) 
Code
matched |> 
  mutate(randomized = ifelse(randomized, "Randomized", "Not randomized")) |> 
  dplyr::count(randomized, has_mg_data, category, name = "n samples") |> 
  arrange(randomized, has_mg_data, -`n samples`) |> 
  dplyr::rename(
    `Sample category` = category,
    `Randomized` = randomized,
  ) |>
  group_by(Randomized) |> 
  gt(row_group_as_column = TRUE) 
has_mg_data Sample category n samples
Not randomized FALSE Sample from a non-randomized participant not in mg manifest 270
FALSE No swab collected according to CRF35 (and no MG data) 144
FALSE Not a clinical sample 29
FALSE Sample from a non-randomized participant not sequenced 1
TRUE Sequenced sample from non-randomized participants 73
TRUE Not a clinical sample 24
TRUE Clinical sample from another study 7
Randomized FALSE No swab collected according to CRF35 (and no MG data) 3846
FALSE Sample from a randomized participant not in mg manifest 10
FALSE Sample from a randomized participant not sequenced 2
TRUE Expected sample 891
TRUE Unexpected sequencing data (no swab collected according to CRF35) 2
Code
plot_sample_categories <- function(matched){
  
  matched |> 
    mutate(
      pid = pid |> fct_infreq(), 
      randomized = ifelse(randomized, "Randomized", "Not randomized") |> factor() |> fct_rev()
    ) |> 
    filter(
      has_mg_data | (specimen_collection_swab > 0)
    ) |> 
    ggplot() +
    aes(x = pid, y = visit_code, fill = category) +
    geom_tile() +
    facet_grid(. ~ site + randomized, scales = "free", space = "free") +
    scale_fill_manual(values = c(
      "Expected sample" = "steelblue1",
      "No swab collected according to CRF35 (and no MG data)" = "gray80",
      "Not a clinical sample" = "gray60",
      "Clinical sample from another study" = "gray40",
      "Sample from a randomized participant not sequenced" = "red",
      "Sample from a randomized participant not in mg manifest" = "red3",
      "Sample from a non-randomized participant not sequenced" = "red4",
      "Sample from a non-randomized participant not in mg manifest" = "black",
      "Sequenced sample from non-randomized participants" = "steelblue3",
      "Sequenced sample not listed in CRF35" = "orange2",
      "Unexpected sequencing data (no swab collected according to CRF35)" = "orange1",
      "Unexpected participant ID" = "purple"
    )) +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
  
}
Code
plot_sample_categories(matched) + ggtitle("Combined rows from the MG manifest and CRF35 data")

Code
plot_sample_categories(matched |> dplyr::filter(randomized)) + 
  ggtitle('Randomized participants')

There are also two samples that were not listed in CRF35 but have been sequenced - these are likely daily swabs that were sent by mistake to the Ravel lab.

Code
matched |> 
  dplyr::filter(category == "Unexpected sequencing data (no swab collected according to CRF35)") |> 
  gt()
pid visit_code Number of self collected swabs in_CRF sample_type control_type swab_barcode mg_pid no_mg_data in_mg_manifest has_mg_data site randomized pid_in_CRF n_mg_rows category
068200278 1001 0 TRUE Clinical sample 2280392 68200278 FALSE TRUE TRUE CAP TRUE TRUE 1 Unexpected sequencing data (no swab collected according to CRF35)
068200310 1104 0 TRUE Clinical sample 2269977 68200310 FALSE TRUE TRUE CAP TRUE TRUE 1 Unexpected sequencing data (no swab collected according to CRF35)

We add a column to the manifest data with the sample categories defined above.

Code
technical_metadata_agg <- 
  technical_metadata_agg |> 
  dplyr::left_join(
    matched |> select(pid, visit_code, category) |> distinct() |> 
      dplyr::rename(sample_category = category)
  )

technical_metadata_agg |> dplyr::count(sample_category) |> gt(caption = "Sample categories of the sequenced samples")
Sample categories of the sequenced samples
sample_category n
Clinical sample from another study 7
Expected sample 891
Not a clinical sample 53
Sample from a non-randomized participant not sequenced 1
Sample from a randomized participant not sequenced 2
Sequenced sample from non-randomized participants 73
Unexpected sequencing data (no swab collected according to CRF35) 2
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = "sample_category",
      description = "Sample classification based on the data about collected specimen as documented in CRF35.",
      print_label = c("Sample category")
    )
  ) |> 
  distinct()

Defining the unique cross-assay identifier

The unique cross-assay identifier (uid) is

  • the concatenation of pid and visit_code for the “expected” clinical samples.
  • the swab_barcode for the non-clinical samples and the “unexpected” clinical samples.
Code
technical_metadata_agg <- 
  technical_metadata_agg |> 
  group_by(pid, visit_code) |> mutate(i = row_number()) |> ungroup() |> 
  mutate(
    uid = 
      case_when(
        str_detect(sample_type, "Clinical sample") & (i > 1) ~ str_c(pid, "_", visit_code, "_", i),
        str_detect(sample_type, "Clinical sample") ~ str_c(pid, "_", visit_code),
        !(is.na(swab_barcode) | (swab_barcode == ""))  ~ swab_barcode,
        TRUE ~ mg_uid |> str_remove("MG_")
      ),
  ) |> 
  relocate(uid, .before = mg_uid) |> 
  select(-i)
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = "uid",
      description = "Unique sample identifier - common ID across VIBRANT assays.",
      print_label = c("Sample ID")
    )
  ) |> 
  distinct()

Creating a SummarizedExperiment object from these tables

The SummarizedExperiment object will contain the following assays:

  • count
  • count_corr
  • rel_ab

We note some negative (although extremely small) values in the count, count_corr, and rel_ab assays (all for native L. crispatus, I assume it being an artifact from kSANITY). We will set these to 0.

The SE colData will contain the technical metadata, the total number of reads for each sample, along with the metagenomics-based CST, subCST, and VALENCIA’s assignment scores. For samples that have been sequenced twice, we will merge the sequencing and processing information from both sequencing runs.

The SE rowData will contain the VIRGO2 taxonomic table, augmented with the LBP strain information.

The LBP strain information is loaded as LBP_strain_info from the VIBRANT Dropbox file that contains information on the LBP strains

Since the names of the strains do not match exactly those used by Michael, we modify them to match those provided by Michael.

Code
LBP_strain_info <- 
  LBP_strain_info |> 
  mutate(
    strain_id = `VMRC ID`, 
    LBP = ifelse(is.na(LC106), "LC-115", "LC-106 & LC-115") |> factor(), 
    strain_origin = `Geographic source` |> factor()
  ) |>
  dplyr::rename(Biose_ID = `Biose ID`) |> #VMRC_ID = `VMRC ID` 
  dplyr::select(strain_id, LBP, strain_origin, Biose_ID) |> 
  arrange(strain_origin, LBP) |> 
  mutate(strain_id = strain_id |> fct_inorder()) |> 
  dplyr::select(strain_id, LBP, strain_origin, contains("ID")) |> 
  mutate(
    strain_id_mg = sub("_.*", "", strain_id),
    strain_id_mg = 
      case_when(
        strain_id_mg == "CC0028A1" ~ "C0028A1", 
        TRUE ~ strain_id_mg
      )
  )

LBP_strain_info |>   
  set_variable_labels(
    strain_id = "Strain ID", 
    strain_id_mg = "Strain ID (as in metagenomics data)",
    strain_origin = "Strain origin", 
    Biose_ID = "Biose ID"
  ) |> 
  gt(caption = "LBP strain information") |> 
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_column_labels())
LBP strain information
Strain ID LBP Strain origin Biose ID Strain ID (as in metagenomics data)
FF00018 LC-106 & LC-115 SA GA08 FF00018
FF00051 LC-106 & LC-115 SA GA09 FF00051
UC101_2_33 LC-106 & LC-115 SA GA12 UC101
FF00004 LC-115 SA GA07 FF00004
FF00064 LC-115 SA GA10 FF00064
FF00072 LC-115 SA GA11 FF00072
UC119_2_PB0238 LC-115 SA GA13 UC119
122010_1999_16 LC-115 SA GA14 122010
185329_1999_17 LC-115 SA GA15 185329
C0059E1 LC-106 & LC-115 US GA03 C0059E1
C0022A1 LC-106 & LC-115 US GA04 C0022A1
C0175A1 LC-106 & LC-115 US GA06 C0175A1
C0006A1 LC-115 US GA01 C0006A1
CC0028A1 LC-115 US GA02 C0028A1
C0112A1 LC-115 US GA05 C0112A1

We also add the manifest column dictionary to the @metadata slot of the SE object.

Code
mg_to_SE <- function(counts, counts_corr, relabs, technical_metadata_agg, dictionary, VIRGO2_taxonomic_table, LBP_strain_info) {
  
  ## ASSAYS
  assay_count <- 
    counts |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |> 
    drop_na() |> 
    t() |> 
    pmax(0)
  
  assay_count_corr <- 
    counts_corr |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |>
    t() |> 
    pmax(0)
  
  assay_relative_ab <- 
    relabs |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |>
    t() |> 
    pmax(0)
    
  
  ## colData 
  
  # We build the SE colData from the the technical metadata
  # to which we add the total number of non-human reads
  # and the cst related data
  se_coldata <- 
    technical_metadata_agg |> 
    dplyr::left_join(
      counts |> 
        mutate(mg_uid = sampleID) |> 
        select(mg_uid, CST, subCST, score) |>
        dplyr::rename(
          cst = CST,
          sub_cst = subCST,
          valencia_score = score
        ) |> 
        mutate(
          total_non_human_reads = 
            rowSums(counts |> select(-c(sampleID, CST, subCST, score)))
        ), 
      by = join_by(mg_uid)
    ) |> 
    select(
      uid, mg_uid, sample_type, control_type, 
      total_non_human_reads, poor_quality_mg_data, 
      cst, sub_cst, valencia_score, 
      everything()
      ) |> 
    arrange(sample_type, uid) |> 
    dplyr::filter(!is.na(total_non_human_reads))
  
  if (any(se_coldata$no_mg_data)) 
    warning(
      "Some samples have no metagenomics data (no_mg_data == TRUE); this should NOT be the case."
    )
  se_coldata <- se_coldata |> select(-no_mg_data) 
  
  dictionary <- 
    dictionary |> 
    bind_rows(
      tibble(name = "total_non_human_reads", description = "Total `counts` per sample"),
      tibble(name = "cst", original_name = "CST", description = "VALENCIA CST based on metagenomic taxonomic composition"),
      tibble(name = "sub_cst", original_name = "subCST", description = "VALENCIA sub-CST based on metagenomic taxonomic composition"),
      tibble(name = "valencia_score", original_name = "score", description = "VALENCIA assignment score based on metagenomic taxonomic composition")
    ) 
  
  dictionary <- 
    tibble(name = colnames(se_coldata)) |> 
    dplyr::left_join(dictionary, by = join_by(name)) |> 
    mutate(table = "colData")
  
  se_coldata <- se_coldata |> as.data.frame()
  
  ## rowData
  se_rowdata <- 
    tibble(
      taxon_id = 
        counts |> 
        dplyr::select(-c(sampleID, CST, subCST, score)) |>
        colnames()
    ) |> 
    dplyr::left_join(
      VIRGO2_taxonomic_table |> 
        mutate(
          taxon_id = Taxa,
          last_available_taxonomic_level = 
            Full_taxonomy |> str_remove_all(".*;") |> str_remove_all("_.*")
        ) |> 
        relocate(Full_taxonomy, .after = Species) |> 
        dplyr::rename(taxon_category = Category),
      by = join_by(taxon_id)
    ) |> 
    dplyr::left_join(
      LBP_strain_info |> 
        dplyr::rename(taxon_id = strain_id_mg),
      by = join_by(taxon_id)
    ) |> 
    mutate(
      taxon_label = 
        ifelse(!is.na(LBP), "LBP strain ", "") |> 
        str_c(taxon_id |> str_replace_all("_", " ")) |> 
        str_c(
          ifelse(
            last_available_taxonomic_level %in% c("g","f"), 
            str_c(" (", last_available_taxonomic_level, ")"), ""
          )
        ) |> 
        str_c(
          ifelse(taxon_id == "Lactobacillus_crispatus", " (undifferentiated)", "")
        ) |> 
        str_c(
          ifelse(taxon_id == "Pichia_kudriavzevii", " (prev. Candida krusei)", "")
        ) |> 
        str_c(
          ifelse(taxon_id == "UBA629_sp005465875", " (BVAB1 / Ca. Lachnocurva v.)", "")
        )
    ) |> 
    relocate(taxon_label, .after = taxon_id) |>
    janitor::clean_names() |>
    dplyr::rename(LBP = lbp) |> 
    as.data.frame() |> 
    select(-taxa) |> 
    mutate(
      taxon_category = taxon_category |> str_replace("HumanVirus", "Human virus")
    )
  
  dictionary <- 
    dictionary |> 
    bind_rows(
      tibble(
        name = colnames(se_rowdata)
      ) |> 
        mutate(
          description = 
            case_when(
              name == "taxon_id" ~ "Taxon ID (as in the VIRGO2 taxonomic table)",
              name == "taxon_label" ~ "Taxon label (human-readable)",
              name == "taxon_category" ~ "Taxon category (Bacteria, protist, etc.)",
              name == "domain" ~ "Domain",
              name == "phylum" ~ "Phylum",
              name == "class" ~ "Class",
              name == "order" ~ "Order",
              name == "family" ~ "Family",
              name == "genus" ~ "Genus",
              name == "species" ~ "Species",
              name == "full_taxonomy" ~ "Full taxonomic classification (concatenated)",
              name == "last_available_taxonomic_level" ~ "Last available taxonomic level",
              name == "strain_id" ~ "LBP strain ID",
              name == "LBP" ~ "The LBP product(s) in which this strain was included.",
              name == "strain_origin" ~ "LBP strain origin (US or SA)",
              name == "biose_id" ~ "LBP strain ID used by Biose",
              TRUE ~ "???"
            ),
          print_label = 
            case_when(
              name == "taxon_id" ~ "Taxon ID",
              name == "taxon_label" ~ "Taxon",
              name == "taxon_category" ~ "Category",
              name == "full_taxonomy" ~ "Taxonomy",
              name == "LBP" ~ "LBP",
              name == "strain_origin" ~ "Strain origin",
              name == "biose_id" ~ "Biose ID",
              TRUE ~ description
            ),
          table = "rowData"
        )
    )
    
  # Harmonization of the order of samples and feature
  
  ordered_mg_uids <- se_coldata$mg_uid
  assay_count <- assay_count[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  assay_count_corr <- assay_count_corr[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  assay_relative_ab <- assay_relative_ab[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  se_coldata <- se_coldata |> set_rownames(se_coldata$uid)

  ordered_taxa <- se_rowdata$taxon_id
  assay_count <- assay_count[ordered_taxa, ]
  assay_count_corr <- assay_count_corr[ordered_taxa, ]
  assay_relative_ab <- assay_relative_ab[ordered_taxa, ]
  
  SummarizedExperiment::SummarizedExperiment(
    assays = list(count = assay_count, count_corr = assay_count_corr, rel_ab = assay_relative_ab),
    rowData = se_rowdata,
    colData = se_coldata,
    metadata = list(
      name = "VIBRANT metagenomics taxonomic counts and relative abundances",
      date = today(),
      dictionary = dictionary
    )
  )
}
Code
SE_mg <- 
  mg_to_SE(
    counts = counts, counts_corr = counts_corr, relabs = relabs, 
    technical_metadata_agg = technical_metadata_agg, dictionary = dictionary,
    VIRGO2_taxonomic_table = VIRGO2_taxonomic_table, LBP_strain_info = LBP_strain_info
    )
Code
SE_mg <- 
  SE_mg |> 
  mutate(
    location = 
      case_when(
        str_detect(pid, "06810") ~ "US",
        str_detect(pid, "06820") ~ "SA",
        TRUE ~ NA_character_
      )
  )
Code
SE_mg
# A SummarizedExperiment-tibble abstraction: 775,884 × 58
# Features=779 | Samples=996 | Assays=count, count_corr, rel_ab
   .feature .sample        count count_corr rel_ab uid        mg_uid sample_type
   <chr>    <chr>          <dbl>      <dbl>  <dbl> <chr>      <chr>  <fct>      
 1 C0022A1  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 2 C0059E1  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 3 C0175A1  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 4 FF00018  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 5 FF00051  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 6 UC101    068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 7 122010   068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 8 185329   068100004_0000     0          0      0 068100004… MG_202 Clinical s…
 9 C0006A1  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
10 C0028A1  068100004_0000     0          0      0 068100004… MG_202 Clinical s…
# ℹ 40 more rows
# ℹ 50 more variables: control_type <fct>, total_non_human_reads <dbl>,
#   poor_quality_mg_data <lgl>, cst <chr>, sub_cst <chr>, valencia_score <dbl>,
#   swab_barcode <chr>, pid <chr>, mg_pid <chr>, visit_code <chr>,
#   mg_visit_code <chr>, visit_code_flagged <lgl>, mg_sample_type <chr>,
#   ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>, ext_lib_position <chr>,
#   sequencing_sample_id <chr>, library <chr>, fragment_size <dbl>, …

Exploratory & QC analyses

Total number of counts/relative abundances per sample

Total number of non-human reads per sample

Code
get_var_print_label <- function(SE, var = "poor_quality_mg_data"){
  var_label <- 
    SE@metadata$dictionary |> 
    dplyr::filter(name == var) |> 
    pull(print_label)
  
  if (is.na(var_label)) var_label <- var |> str_replace_all("_", " ") |> str_to_sentence()
  
  var_label
}
Code
plot_total_non_human_reads <- function(SE_mg, color_by = "poor_quality_mg_data"){
  
  if (is.logical(SE_mg@colData[, color_by])) {
    color_breaks = c(FALSE, TRUE)
    color_values = c("steelblue2", "red2")
    color_labels = c("No", "Yes")
  }
  else {
    color_breaks = SE_mg@colData[, color_by] |> unique()
    color_values = RColorBrewer::brewer.pal(n = length(color_breaks), name = "Set2")
    color_labels = color_breaks
  }
  
  SE_mg |> 
    colData() |> 
    as_tibble() |>
    ggplot() +
    aes(x = total_non_human_reads, fill = !!sym(color_by)) +
    geom_vline(xintercept = 500000, linetype = "dashed", color = "black") +
    geom_histogram(bins = 50) +
    scale_x_log10() + 
    scale_fill_manual(get_var_print_label(SE_mg, var = color_by), breaks = color_breaks, values = color_values, labels = color_labels) +
    facet_grid(sample_type ~ ., scale = "free") +
    labs(
      x = "Total number of non-human reads per sample", 
      y = "Number of samples",
      caption = "Samples with less than 500,000 non-human reads (dashed line) should be considered with caution."
    ) +
    theme(
      strip.text.y = element_text(angle = 0)
    )
}
Code
SE_mg <- 
  SE_mg |> 
  mutate(
     reextraction_status = 
      case_when(
        selected_for_re_extraction & re_extracted_data_used ~ "Re-extracted data used",
        selected_for_re_extraction & !re_extracted_data_used ~ "Re-extracted data not used",
        !selected_for_re_extraction ~ "Not re-extracted",
        TRUE ~ "???"
      )
  )

SE_mg@metadata$dictionary <- 
  SE_mg@metadata$dictionary |> 
  bind_rows(
    tibble(
      name = "reextraction_status",
      description = "Re-extraction status of the sample (if selected for re-extraction, whether the re-extracted data was used or not).",
      print_label = "Re-extraction status"
    )
  ) |> 
  distinct()
Code
plot_total_non_human_reads(SE_mg, color_by = "poor_quality_mg_data")

Code
plot_total_non_human_reads(SE_mg, color_by = "selected_for_re_extraction")

Code
plot_total_non_human_reads(SE_mg, color_by = "reextraction_status")

Code
plot_total_non_human_reads(SE_mg, color_by = "library_sequenced_twice")

Code
SE_mg |> 
  colData() |> 
  as_tibble()  |> 
  arrange(total_non_human_reads) |> 
  mutate(index = row_number()) |>
  select(index, uid, poor_quality_mg_data, total_non_human_reads, sample_type, control_type, library_sequenced_twice, selected_for_re_extraction, sub_cst) |> 
  dplyr::filter(total_non_human_reads < 1e6) |>
  gt(caption = "Samples with less than 1 million non-human reads") 
Samples with less than 1 million non-human reads
index uid poor_quality_mg_data total_non_human_reads sample_type control_type library_sequenced_twice selected_for_re_extraction sub_cst
1 EQ05822471 FALSE 0 Negative control Unused swab + C2 FALSE FALSE UnAssigned
2 EQ05822476 FALSE 0 Negative control Nuclease-free water FALSE FALSE UnAssigned
3 EQ05822493 FALSE 3796 Negative control Unused swab + C2 FALSE FALSE IV-A
4 EQ05822499 FALSE 5104 Negative control Nuclease-free water FALSE FALSE IV-B
5 EQ05822485 FALSE 13074 Negative control Unused swab + C2 FALSE FALSE IV-C0
6 EQ05822501 FALSE 28095 Negative control Nuclease-free water FALSE FALSE III-B
7 EQ05822491 FALSE 37244 Negative control Nuclease-free water FALSE FALSE IV-B
8 068100026_2120 FALSE 70014 Clinical sample FALSE FALSE III-B
9 068200050_1900 FALSE 83473 Clinical sample FALSE TRUE IV-C0
10 068200050_1100 TRUE 96738 Clinical sample FALSE FALSE IV-B
11 EQ05822500 FALSE 103072 Negative control Unused swab + C2 FALSE FALSE III-B
12 068200422_2120 FALSE 112481 Clinical sample FALSE TRUE IV-B
13 068200113_0000 FALSE 133557 Clinical sample FALSE TRUE IV-B
14 068200087_1200 TRUE 163064 Clinical sample FALSE FALSE IV-A
15 068200361_1300 FALSE 174025 Clinical sample FALSE TRUE IV-B
16 068200361_1900 FALSE 183330 Clinical sample TRUE TRUE IV-B
17 068200240_1400 TRUE 183852 Clinical sample FALSE FALSE IV-B
18 068100061_1100 FALSE 276731 Clinical sample TRUE TRUE III-B
19 068100014_0000 FALSE 283167 Clinical sample FALSE TRUE III-A
20 068200062_1700 FALSE 336795 Clinical sample TRUE TRUE III-B
21 068100062_2120 FALSE 360084 Clinical sample FALSE FALSE III-B
22 068100036_1200 FALSE 371029 Clinical sample FALSE TRUE IV-B
23 068100052_1000 FALSE 426063 Clinical sample FALSE TRUE III-B
24 068200387_2120 FALSE 433160 Clinical sample FALSE FALSE III-A
25 068200204_1900 FALSE 459525 Clinical sample FALSE TRUE IV-C1
26 068100049_1100 FALSE 463533 Clinical sample FALSE TRUE III-B
27 068200414_1500 FALSE 469865 Clinical sample FALSE FALSE IV-B
28 068200399_1200 FALSE 502682 Clinical sample FALSE TRUE III-B
29 068100006_1400 FALSE 503866 Clinical sample FALSE TRUE I-B
30 068200399_1300 FALSE 559171 Clinical sample FALSE TRUE III-B
31 068100062_1100 FALSE 559941 Clinical sample FALSE TRUE IV-B
32 068200361_1400 FALSE 653137 Clinical sample FALSE FALSE IV-B
33 068200204_2120 FALSE 655502 Clinical sample FALSE TRUE I-B
34 068100029_1100 FALSE 685355 Clinical sample FALSE TRUE IV-B
35 EQ05822202 FALSE 716766 Negative control Unused swab + C2 FALSE FALSE IV-A
36 068200350_1100 FALSE 728568 Clinical sample FALSE TRUE III-A
37 068200285_0000 FALSE 731985 Clinical sample FALSE TRUE IV-B
38 068100049_1000 FALSE 774826 Clinical sample FALSE FALSE IV-B
39 068200285_1300 FALSE 792563 Clinical sample FALSE TRUE I-B
40 068100061_1400 FALSE 814658 Clinical sample TRUE FALSE I-A
41 068200399_1900 FALSE 820251 Clinical sample FALSE TRUE IV-B
42 068200239_1900 FALSE 822427 Clinical sample FALSE TRUE IV-B
43 068200375_2120 FALSE 826468 Clinical sample FALSE FALSE III-A
44 068100029_2120 FALSE 830458 Clinical sample FALSE FALSE III-A
45 068200127_1000 FALSE 832821 Clinical sample TRUE TRUE IV-C1
46 EQ05822427 FALSE 852172 Negative control Nuclease-free water FALSE FALSE I-B
47 068100049_1400 FALSE 860927 Clinical sample FALSE FALSE III-B
48 EQ05822433 FALSE 870398 Negative control Unused swab + C2 FALSE FALSE I-B
49 068200259_0000 FALSE 877864 Clinical sample FALSE FALSE IV-B
50 068100026_1100 FALSE 885324 Clinical sample FALSE FALSE III-B
51 068100053_1300 FALSE 892108 Clinical sample TRUE FALSE III-A
52 EQ05822226 FALSE 898191 Negative control Nuclease-free water FALSE FALSE IV-A
53 068200388_1200 FALSE 921074 Clinical sample FALSE TRUE IV-B
54 EQ05822481 FALSE 930793 Positive control Mock 1 FALSE FALSE IV-A
55 068200247_1100 FALSE 959547 Clinical sample TRUE FALSE IV-B
56 068100062_0000 FALSE 966135 Clinical sample FALSE FALSE IV-B
57 068200023_1100 FALSE 998430 Clinical sample FALSE TRUE IV-B
Code
plot_total_non_human_reads(SE_mg, color_by = "selected_for_re_extraction") +
  facet_grid(visit_code ~ ., scale = "free")

Total relative abundances per sample

We check that the relative abundances sum to 1 for each sample.

Code
tol <- 1e-5

SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  summarise(total_rel_ab = sum(rel_ab)) |> 
  ggplot(aes(x = total_rel_ab)) +
  geom_histogram(binwidth = tol) + 
  labs(x = "Total relative abundances per sample", 
       y = "Number of samples") +
  scale_x_continuous(limits = 1 + c(-1, 1) * 5 * tol) 

Code
rm(tol)
Code
# We check the relative abundances computation 

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  mutate(rel_ab_check = count_corr/sum(count_corr)) |> 
  ungroup() 

if (max(tmp$rel_ab_check - tmp$rel_ab, na.rm = TRUE) > 0.01) warning("!! Relative abundances do not match the computed values !!")

Non-bacterial DNA

There are some non-bacterial taxa in the dataset:

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(taxon_category != "Bacteria") |> 
  group_by(.sample, taxon_category) |> 
  summarize(rel_ab = sum(rel_ab), .groups = "drop") |> 
  ggplot() +
  aes(x = rel_ab, fill = taxon_category) +
  geom_histogram(binwidth = 0.001) +
  facet_wrap(~ taxon_category, scales = "free") +
  scale_x_continuous("Total relative abundance per sample for that group", labels = scales::percent_format()) +
  ylab("Number of samples") +
  guides(fill = "none")

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(taxon_category != "Bacteria") |> 
  group_by(.sample) |>
  mutate(total_rel_ab = sum(rel_ab)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_ab) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  dplyr::filter(as.numeric(.sample) <= 20) |>
  ggplot() +
  aes(x = rel_ab, y = .sample |> fct_rev(), fill = taxon_category) +
  geom_col() +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top 20 samples with most non-bacterial relative abundances") 

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(taxon_category != "Bacteria") |> 
  group_by(taxon_label) |>
  mutate(total_rel_ab = sum(rel_ab), max_rel_ab = max(rel_ab)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_ab) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  dplyr::filter(as.numeric(taxon_label) <= 20, max_rel_ab > 0.001) |>
  ggplot() +
  aes(x = rel_ab, y = taxon_label |> fct_rev(), color = taxon_category, shape = poor_quality_mg_data) +
  geom_point(alpha = 0.3, size = 2) +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top non-bacterial organisms") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(caption = "Each dot is a sample.", color = "Category", shape = "Poor quality MG data")

Consequently, we create a new assay which contains the relative abundances of the bacteria only.

Code
SE_mg <- 
  SE_mg |>
  mutate(
    rel_ab_bact = rel_ab * (!is.na(taxon_category) & (taxon_category == "Bacteria"))
  )

assay(SE_mg, "rel_ab_bact") <- t(t(assay(SE_mg, "rel_ab_bact"))/colSums(assay(SE_mg, "rel_ab_bact")))

“MultiGenera” content

Some VIRGO2 genes cannot be assigned to a single taxon, and are thus assigned to the “MultiGenera” taxon.

When building the taxonomic composition tables, the counts/relative abundances of these genes are agglomerated into a single “MultiGenera” taxon.

The distribution of the relative abundances of this “MultiGenera” taxon is shown below.

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(.feature == "MultiGenera") |> 
  ggplot(aes(x = rel_ab)) +
  geom_histogram(binwidth = 0.005) +
  labs(x = 'Relative abundance of "MultiGenera" per sample', 
       y = "Number of samples") +
  scale_x_continuous(labels = scales::percent_format())

Important

For downstream analyses, we make the assumption that all taxa present in the samples equally contributes to these MultiGenera genes. Consequently, we estimate the relative abundance of each taxa in a sample solely from the genes uniquely assigned to these taxa (i.e., excluding the MultiGenera contribution).

This assumption may not always hold and the uncertainty on the relative abundances will be higher in samples with higher proportions of MultiGenera genes.

We re-name the rel_ab assay to rel_ab_all to reflect that it contains the relative abundances of every “detected/assigned” micro-organisms and create a new assay (rel_ab) that contains the relative abundances of the bacteria only, excluding the MultiGenera contribution.

Code
# we rename the `rel_ab` assay to `rel_ab_all`
SE_mg <- SE_mg |> mutate(rel_ab_all = rel_ab) 
assays(SE_mg) <- assays(SE_mg)[names(assays(SE_mg)) != "rel_ab"]
Code
# we create a new assay `rel_ab` that contains the relative abundances of the non-MultiGenera bacteria only
SE_mg <- SE_mg |> mutate(rel_ab = rel_ab_bact * (.feature != "MultiGenera"))
assay(SE_mg, "rel_ab") <- t(t(assay(SE_mg, "rel_ab"))/colSums(assay(SE_mg, "rel_ab")))
Code
# we re-order the assays
assays(SE_mg) <- assays(SE_mg)[c("count", "count_corr",  "rel_ab_all", "rel_ab_bact", "rel_ab")]

And since the MultiGenera relative abundance is informative for quantifying uncertainty in samples (just like the total non-human reads), we add it to the colData of the SE_mg object.)

Code
SE_mg$rel_ab_multi_genera <- 
  SE_mg |> 
  colData() |> 
  as_tibble() |> 
  mutate(.sample = uid) |> 
  select(.sample) |> 
  dplyr::left_join(
    SE_mg |> 
      as_tibble() |> 
      dplyr::filter(.feature == "MultiGenera") |> 
      select(.feature, .sample, rel_ab_bact) |> 
      mutate(
        rel_ab_multi_genera = rel_ab_bact
      ) |> 
      select(.sample, rel_ab_multi_genera),
    by = join_by(.sample)
  ) |> 
  pull(rel_ab_multi_genera)

colData(SE_mg) <- 
  colData(SE_mg) |> 
  as.data.frame() |> 
  relocate(
    rel_ab_multi_genera, .after = total_non_human_reads
  ) |> 
  DataFrame()

SE_mg@metadata$dictionary <-
  SE_mg@metadata$dictionary |> 
  bind_rows(
    tibble(
      name = "rel_ab_multi_genera",
      description = "Total relative abundance of MultiGenera genes in the sample (as a fraction of the total bacterial reads).",
      print_label = "MultiGenera rel. ab."
    )
  ) |> 
  distinct()

Control samples

There are 4 categories of control samples:

Code
SE_mg |> 
  colData() |> 
  as_tibble() |> 
  dplyr::filter(!str_detect(sample_type, "Clinical sample")) |>
  select(control_type, uid, ext_lib_plate_nb) |> 
  group_by(control_type) |> 
  arrange(control_type, ext_lib_plate_nb) |>
  gt(caption = "Control samples", row_group_as_column = TRUE) 
Control samples
uid ext_lib_plate_nb
Mock 1 EQ05822515 1
EQ05822481 5
EQ05822509 8
EQ05822503 10
EQ05822445 11
EQ05822469 NA
MC1 NA
Mock 2 EQ05822227 1
EQ05822473 3
EQ05822497 10
EQ05822439 11
Nuclease-free water EQ05822226 1
EQ05822476 3
EQ05822499 5
EQ05822501 8
EQ05822491 10
EQ05822427 NA
Unused swab + C2 EQ05822202 1
EQ05822493 5
EQ05822500 8
EQ05822485 10
EQ05822433 NA
EQ05822471 NA

These control samples have been created such that

  • Mock 1: pooled patient samples from CAP084 swabs

    • 50 non-BV
      • VMRC 15 L. crispatus isolates
      • 12 L. crispatus dominant samples
      • 23 L. iners dominant samples
    • 50 BV
      • 24 G. vaginalis dominant samples
      • BV bacteria, not G. vaginalis dominant
  • Mock 2: Pooled pure isolates (equal volumes of pure isolates at OD 0.1)

    • VMRC 15 L. crispatus isolates
    • G. vaginalis ATCC
    • A. vaginae ATCC
    • P. bivia ATCC
    • L. crispatus ATCC
    • L. gasseri ATCC
    • S. agalactiae ATCC
    • L. jensenii ATCC
    • F. magna

Taxonomic composition of these control samples:

Code
control_types <- SE_mg$control_type[!is.na(SE_mg$control_type) & (SE_mg$control_type != "")] |> unique() |> sort()
Code
map(
  control_types,
  ~ {
    tmp <- 
      SE_mg |> 
      as_tibble() |> 
      dplyr::filter(control_type == .x) |> 
      dplyr::filter(rel_ab > 0) 
    top_taxa <- 
      tmp |> 
      group_by(taxon_label) |> 
      summarise(total_re_lab = sum(rel_ab)) |> 
      arrange(desc(total_re_lab)) |> 
      slice_head(n = 25) |> 
      pull(taxon_label) |> sort()
    
    all_taxa <- tmp$taxon_label |> unique() |> sort()
    
    tmp <- 
      tmp |> 
      mutate(
        sample_label = str_c(.sample, ifelse(selected_for_re_extraction, " (re-x)", ""))
      ) 
    
    tmp |> 
      ggplot() +
      aes(x = sample_label) +
      facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free", space = "free") +
      geom_col(aes(y = rel_ab, fill = taxon_label), color = "black", linewidth = 0.1) +
      geom_point(
        data = 
          tmp |> 
          select(sample_label, ext_lib_plate_nb, total_non_human_reads, rel_ab_multi_genera, poor_quality_mg_data) |> 
          distinct(), 
        aes(y = 1.1, size = log10(total_non_human_reads), alpha = rel_ab_multi_genera)
      ) +
      scale_x_discrete("Samples") +
      scale_y_continuous("Rel. ab.", breaks = seq(0, 1, by = 1/4), minor_breaks = seq(0, 1, by = 0.05)) +
      scale_fill_manual(
        "Top taxa", limits = top_taxa, breaks = all_taxa, values = get_taxa_colors(all_taxa), #, limits = all_taxa, values = get_taxa_colors(all_taxa),
        guide = guide_legend(ncol = 3)
        ) + 
      scale_alpha_continuous(
        "Relative abundance of MultiGenera genes",
        range = c(1, 0), limits = c(0, 0.4),
        guide = guide_legend(direction = "horizontal")
      ) +
      scale_size_continuous(
        "Log10 of total non-human reads",
        range = c(0.5, 5), limits = c(3, 9),
        guide = guide_legend(direction = "horizontal")
      ) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      labs(title = str_c("Control samples : ", .x))
  }
)
[[1]]


[[2]]


[[3]]


[[4]]

Mock 1

For Mock 1 samples, the relative abundances are not very replicable across plates, but that seems to correlate with lower total number of non-human reads.

We check below if detection (presence/absence) is better and better correlate within Mocks than across Mocks.

Code
tmp <- 
  SE_mg |> 
  as_tibble() |> 
  dplyr::filter(str_detect(control_type, "Mock")) |> 
  mutate(present = (rel_ab > 1/1000)) |> 
  group_by(control_type, taxon_label) |>
  mutate(tot_prop = sum(rel_ab)) |> 
  ungroup() |> 
  arrange(control_type, -tot_prop) |> 
  mutate(taxon_label = taxon_label |> fct_inorder()) 
Code
tmp |> 
  ggplot() +
  aes(x = taxon_label, y = .sample, fill = rel_ab |> log10()) +
  geom_tile() +
  ylab("") +
  scale_x_discrete("Taxa", breaks = NULL) +
  scale_fill_gradient(low = "gray99", high = "red", na.value = "white") +
  facet_grid(control_type ~ ., scales = "free", space = "free")

It looks reasonable…

We also check if there were any LBP strains detected in the Mock 1 samples (their expected relative abundance should be around ~ 1/100).

In Mock 2, their relative abundance should be around ~ 1/24.

Code
tmp |> 
  dplyr::filter(str_detect(taxon_label, "LBP")) |>
  mutate(
    exp_rel_ab = 
      case_when(
        control_type == "Mock 1" ~ 1/100,
        control_type == "Mock 2" ~ 1/24,
        TRUE ~ NA_real_
      )
  ) |> 
  ggplot() +
  aes(x = .sample, y = rel_ab, col = .feature) +
  facet_grid(. ~ control_type, scales = "free", space = "free") +
  geom_hline(aes(yintercept = exp_rel_ab), linetype = "dashed", color = "black") +
  geom_point(alpha = 0.5, size = 3) +
  xlab("Barcode") +
  scale_y_continuous(
    "Relative abundance", 
    labels = scales::percent_format(), 
    breaks = seq(0, 1, by = 0.01)
    ) +
  scale_color_discrete("LBP strain") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

The relative abundances in Mock 2 are around what is expected, but LBP strains are not detected in Mock 1.

Mock 2

In the Mock 2, we check for the proportion of species/taxa that were not expected based on the “theoretical” composition of Mock 2

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  dplyr::filter(control_type == "Mock 2") |> 
  dplyr::filter(rel_ab > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_ab, x = .sample, fill = expected) +
  geom_col(linewidth = 0.1) + # color = "black", 
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab.") + 
  facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, ext_lib_plate_nb, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_ab), 2), .groups = "drop") |>
  dplyr::filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample ext_lib_plate_nb expected tot_rel_ab_pc
EQ05822227 1 FALSE 2.18
EQ05822439 11 FALSE 2.38
EQ05822473 3 FALSE 1.76
EQ05822497 10 FALSE 2.26

When we also consider the MultiGenera genes, results look worse:

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  dplyr::filter(control_type == "Mock 2") |> 
  dplyr::filter(rel_ab_bact > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_ab_bact, x = .sample, fill = expected) +
  geom_col(linewidth = 0.1) + # color = "black", 
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab. (of bacterial content)") + 
  facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, ext_lib_plate_nb, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_ab_bact), 2), .groups = "drop") |>
  dplyr::filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample ext_lib_plate_nb expected tot_rel_ab_pc
EQ05822227 1 FALSE 4.52
EQ05822439 11 FALSE 5.08
EQ05822473 3 FALSE 3.31
EQ05822497 10 FALSE 4.97

However, when examining the list of unexpected taxa, it seems that the “unexpectedness” can likely be attributed to shared genes that were assigned to a single taxa in VIRGO2:

Code
tmp |> 
  dplyr::filter(!expected) |> 
  arrange(.sample, -rel_ab_bact) |> 
  group_by(.sample) |> 
  slice_head(n = 20) |> 
  ungroup() |> 
  arrange(-rel_ab_bact) |> 
  mutate(
    taxon_label = taxon_label |> fct_inorder() |> fct_rev(),
    expected = 
      case_when(
        str_detect(taxon_label, "actobacillus|Prevotella|Streptococcus|Finegoldia|MultiGenera") ~ "Likely shared genes",
        TRUE ~ "??? contamination"
      )
    ) |>
  ggplot() +
  aes(x = .sample, y = taxon_label, size = rel_ab_bact, col = expected) +
  geom_point() +
  scale_y_discrete("Top 20 unexpected taxa") +
  scale_color_manual("", values = c("red", "steelblue1")) +
  xlab("Sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Alpha-diversity of clinical samples and controls

Code
vegan::diversity(SE_mg |> assay("rel_ab"), index = "shannon", MARGIN = 2) |> 
  as_tibble() |> 
  mutate(uid = colnames(SE_mg)) |> 
  dplyr::left_join(SE_mg |> colData() |> as_tibble(), by = join_by(uid)) |> 
  ggplot() +
  aes(x = control_type, y = value |> exp(), col = sample_type, fill = sample_type) +
  facet_grid(. ~ sample_type |> str_wrap(width = 20), scales = "free", space = "free") +
  geom_violin(scale = "width", color = NA, alpha = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 0.5, alpha = 0.5) +
  scale_y_log10() +
  labs(
    x = "",
    y = "Effective number of species\nbased on shannon diversity index"
  ) +
  guides(fill = "none", color = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Proportion of LBP strains per sample

We saw above that the LBP strains were detected (as they should have) in the Mock 2 samples.

In this section, we further document the detection and relative abundances of the LBP strains.

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(!is.na(LBP)) |> 
  ggplot() +
  aes(x = rel_ab, fill = (rel_ab == 0)) +
  geom_histogram(binwidth = 0.01) +
  scale_fill_manual("", values = c("steelblue", "gray"), labels = c("Rel. ab = 0", "Rel. ab > 0")) +
  xlab("Relative abundance of LBP strains") +
  ylab("Number of samples x LBP strain")

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(!is.na(LBP), rel_ab > 0) |> 
  ggplot() +
  aes(x = rel_ab) +
  geom_histogram(binwidth = 0.001) +
  xlab("Non-zero relative abundance of LBP strains") +
  ylab("Number of samples") +
  facet_grid(LBP + .feature ~ .) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  as_tibble() |> 
  dplyr::filter(!is.na(LBP), rel_ab > 0) |>
  ggplot() +
  aes(x = .feature) +
  geom_bar() +
  scale_x_discrete("LBP strains") +
  scale_y_continuous("Number of samples with non-zero relative abundances") +
  facet_grid(. ~ LBP, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Code
number_of_non_zero_LBP_strains_per_sample <- 
  SE_mg |> 
  as_tibble() |>  
  dplyr::filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    number_of_non_zero_LBP_strains = sum(rel_ab > 0),
  ) 

number_of_non_zero_LBP_strains_per_sample |> 
  ggplot() +
  aes(x = number_of_non_zero_LBP_strains) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous("Number of non-zero LBP strains per sample", breaks = 0:15, minor_breaks = 0) +
  ylab("Number of samples") 

There are 234 samples (23.49%) with at least one LBP strain detected (non-zero relative abundance).

Code
SE_mg |> 
  as_tibble() |>  
  dplyr::filter(!is.na(LBP)) |> 
  dplyr::left_join(
    number_of_non_zero_LBP_strains_per_sample,
    by = join_by(.sample)
  ) |>
  dplyr::filter(number_of_non_zero_LBP_strains != 0) |>
  group_by(.feature) |>
  mutate(tot_rel_abs_for_strain = sum(rel_ab)) |> 
  ungroup() |> 
  arrange(LBP, -tot_rel_abs_for_strain) |>
  mutate(.feature = .feature |> fct_inorder()) |> 
  group_by(.sample) |> 
  mutate(
    LBP_rel_abs = rel_ab / sum(rel_ab),
    score = weighted.mean(LBP_rel_abs, .feature |> as.numeric()),
    total_LBP_rel_abs = sum(rel_ab)
  ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder()) |> 
  ggplot() +
  aes(x = .feature, y = .sample, fill = rel_ab) +
  geom_tile() +
  scale_fill_continuous("Rel. ab", low = "white", high = "steelblue2") +
  scale_x_discrete("Strains") +
  scale_y_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ LBP + strain_origin, scales = "free_x", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    caption = "Rel. ab. of LBP strains in samples with at least one LBP strain detected (non-zero)",
  )

Compositions

Code
# creating a taxa category "other"
summarized_rel_ab <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.feature) |> 
  mutate(max_rel_ab = max(rel_ab, na.rm = TRUE), mean_rel_ab = mean(rel_ab, na.rm = TRUE)) |> 
  ungroup() |> 
  mutate(
    taxa = 
      case_when(
        !is.na(LBP) ~ taxon_label,
        max_rel_ab > 1/3 ~ taxon_label,
        TRUE ~ "Other"
      ),
    is_lacto = str_detect(genus, "Lactobacillus") & (taxa != "Other")
  ) |> 
  arrange(LBP, is_lacto, desc(max_rel_ab)) |>
  mutate(
    taxa = taxa |> fct_inorder()
  ) |> 
  group_by(.sample, sample_type, control_type, sample_category, poor_quality_mg_data, location, pid, visit_code, taxa, LBP, is_lacto, strain_origin) |>
  summarize(rel_ab = sum(rel_ab), .groups = "drop") |>
  arrange(.sample, -rel_ab) |> 
  group_by(.sample) |>
  mutate(
    score = weighted.mean(taxa |> as.numeric(), rel_ab), 
    tot = sum(rel_ab),
    dom = taxa[1]
    ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder())
Code
summarized_rel_ab |> 
  ggplot() +
  aes(x = .sample, y = rel_ab, fill = taxa) +
  geom_col() +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ sample_type + str_wrap(sample_category, 50), scales = "free", space = "free") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 90, hjust = 0),
    legend.position = "bottom",
  ) +
  ggtitle("Taxonomic composition of all samples")

Code
summarized_rel_ab |> 
  dplyr::filter(sample_category != "Expected sample") |> 
  ggplot() +
  aes(x = .sample, y = rel_ab, fill = taxa) +
  geom_col() +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ str_c(sample_type,"\n", control_type) + str_wrap(sample_category, 20), scales = "free", space = "free") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 90, hjust = 0),
    legend.position = "bottom",
  ) +
  ggtitle('"Unexpected" samples')

Code
summarized_rel_ab |> 
  dplyr::filter(sample_type == "Clinical sample") |> # 
  ggplot() +
  aes(x = pid, y = rel_ab, fill = taxa, alpha = poor_quality_mg_data) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  geom_col() +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  scale_alpha_discrete("Poor quality MG data", range = c(1, 0.5)) +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
  ) +
  ggtitle("Longitudinal profiles for all clinical samples")

Code
summarized_rel_ab |> 
  dplyr::filter(sample_type == "Clinical sample", sample_category %in% c("Expected sample")) |> # 
  ggplot() +
  aes(x = pid, y = rel_ab, fill = taxa, alpha = poor_quality_mg_data) +
  geom_col() +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location, scales = "free_x", space = "free_x") +
  scale_alpha_discrete("Poor quality MG data", range = c(1, 0.5)) +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 5)
  ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
  ) +
  ggtitle('Longitudinal profiles for all "expected" clinical samples')

Code
summarized_rel_ab |> 
  group_by(pid) |> 
  mutate(
    had_poor_quality_sample = any(poor_quality_mg_data)
  ) |> 
  ungroup() |> 
  dplyr::filter(sample_type == "Clinical sample", had_poor_quality_sample) |> # 
  ggplot() +
  aes(x = visit_code, y = rel_ab, fill = taxa, alpha = poor_quality_mg_data) +
  geom_col() +
  scale_x_discrete("Visits") + # , breaks = NULL) +
  facet_grid(. ~ pid + location, scales = "free_x", space = "free_x") +
  scale_alpha_discrete("Poor quality MG data", range = c(1, 0.3)) +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 5)
  ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
  ) +
  ggtitle('Longitudinal profiles for participant with poor quality MG data')

Code
g_comp <- 
summarized_rel_ab |> 
  dplyr::filter(sample_type == "Clinical sample", visit_code == "0000") |> # 
  group_by(.sample) |> mutate(any_LBP = sum(rel_ab[!is.na(LBP)]) > 0) |> ungroup() |> 
  dplyr::filter(any_LBP) |> 
  ggplot() +
  aes(x = pid, y = rel_ab, fill = taxa) +
  geom_col(color = "white") +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_ab$taxa |> levels()),
    guide = guide_legend(nrow = 15)
    ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) 

g_origin <- 
  
summarized_rel_ab |> 
  dplyr::filter(sample_type == "Clinical sample", visit_code == "0000") |> # 
  group_by(.sample) |> mutate(any_LBP = sum(rel_ab[!is.na(LBP)]) > 0) |> ungroup() |> 
  dplyr::filter(any_LBP) |> 
  ggplot() +
  aes(x = pid, y = rel_ab, fill = strain_origin) +
  geom_col(color = "white") +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) 

g_comp + g_origin + plot_layout(nrow = 1, widths = c(2, 1)) +
  plot_annotation(title = "Participant(s) with LBP strains at screening visit")

PCoA on counts

Code
SE_mg <- 
  SE_mg |> 
  mutate(
    combined_sample_type = str_c(sample_type," (", control_type,")") |> str_remove(" \\(\\)")
  )
Code
dist_matrix <-
  vegdist(
    assay(SE_mg[, SE_mg$total_non_human_reads > 0], "count") |> t(), 
    method = "bray")

pcoa <- wcmdscale(dist_matrix, eig = TRUE)
print(pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)

          Inertia Rank
Total      364.11     
Real       429.40  359
Imaginary  -65.29  634

Results have 994 points, 359 axes

Eigenvalues:
  [1] 65.71 51.43 30.25 23.25 16.96 14.59 11.81 11.18  9.45  8.36  7.27  6.27
 [13]  5.85  5.61  5.11  4.90  4.82  4.41  4.19  3.97  3.81  3.77  3.49  3.39
 [25]  3.27  3.05  2.82  2.70  2.58  2.55  2.48  2.40  2.27  2.10  1.98  1.97
 [37]  1.91  1.85  1.77  1.76  1.69  1.67  1.63  1.51  1.45  1.38  1.37  1.34
 [49]  1.30  1.29  1.25  1.22  1.19  1.17  1.14  1.12  1.08  1.05  1.03  0.98
 [61]  0.95  0.94  0.93  0.89  0.88  0.87  0.84  0.84  0.82  0.81  0.78  0.77
 [73]  0.77  0.74  0.73  0.72  0.70  0.68  0.66  0.65  0.64  0.63  0.62  0.61
 [85]  0.59  0.58  0.57  0.56  0.55  0.55  0.53  0.53  0.51  0.50  0.50  0.48
 [97]  0.48  0.47  0.46  0.46  0.45  0.44  0.43  0.42  0.42  0.41  0.41  0.40
[109]  0.39  0.39  0.38  0.37  0.36  0.36  0.36  0.36  0.34  0.34  0.33  0.33
(Showing 120 of 993 eigenvalues)

Weights: Constant
Code
pcoa$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 65.715 18.048
2 51.435 14.126
3 30.250 8.308
4 23.252 6.386
5 16.956 4.657
6 14.593 4.008
7 11.805 3.242
8 11.180 3.070
9 9.447 2.595
10 8.358 2.295
Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  dplyr::filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_data <- 
  tibble(
    uid = rownames(pcoa$points),
    PCoA = pcoa$points
  ) |> 
  dplyr::left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(uid)
  )
Code
pcoa_plot <- function(pcoa_tb, pcoa, color_var, axes = 1:2){
  
  pcoa_tb <- 
    pcoa_tb |> 
    mutate(
      col = as.factor(!!sym(color_var)),
      x = PCoA[, axes[1]],
      y = PCoA[, axes[2]]
      )

  pcoa_tb |>
    ggplot() +
    aes(x = x, y = y, color = col) +
    geom_point(size = 2, alpha = 0.7) +
    coord_fixed() +
    labs(
      x = paste0("PCoA ", axes[1], " (", round(100 * pcoa$eig[axes[1]] / sum(pcoa$eig), 1), "%)"),
      y = paste0("PCoA ", axes[2], " (", round(100 * pcoa$eig[axes[2]] / sum(pcoa$eig), 1), "%)"),
      color = color_var
    ) 
}

Type of sample

Code
sample_type_colors <- c("gray","gray85", "dodgerblue", "black", "purple", "red")

pcoa_plot(pcoa_data, pcoa, "combined_sample_type", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_data, pcoa, "combined_sample_type", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix ~ pcoa_data$sample_type)
manova

Extraction Plate

Code
pcoa_plot(pcoa_data, pcoa, "ext_lib_plate_nb", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "ext_lib_plate_nb", c(3, 4))

Permutation test

Code
manova <- adonis2(dist_matrix ~ pcoa_data$ext_lib_plate_nb)
manova

Selected for re-extraction

Code
pcoa_plot(pcoa_data, pcoa, "selected_for_re_extraction", c(1, 2)) 

Code
pcoa_plot(pcoa_data, pcoa, "selected_for_re_extraction", c(3, 4)) 

Bioinformatics Processing Batch

Code
pcoa_plot(pcoa_data, pcoa, "bioinformatics_processing_batch", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "bioinformatics_processing_batch", c(3, 4))

Library Pool

Code
pcoa_plot(pcoa_data, pcoa, "library_pool", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "library_pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_data |> dplyr::filter(sequencing_lane %in% 1:4), pcoa, "sequencing_lane", c(1, 2))

Code
pcoa_plot(pcoa_data |> dplyr::filter(sequencing_lane %in% 1:4), pcoa, "sequencing_lane", c(3, 4))

PCoA on relative abundances

Code
dist_matrix_relab <-
  vegdist(
    assay(SE_mg[, SE_mg$total_non_human_reads > 0], "rel_ab") |> t(), 
    method = "bray")


pcoa_relab <- wcmdscale(dist_matrix_relab, eig = TRUE)
print(pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)

          Inertia Rank
Total      340.23     
Real       413.30  343
Imaginary  -73.08  650

Results have 994 points, 343 axes

Eigenvalues:
  [1] 75.97 61.84 41.55 20.14 16.38 12.87 11.42  9.51  8.48  7.93  6.88  6.54
 [13]  6.20  5.21  5.07  4.68  4.31  4.28  3.53  3.49  3.23  3.13  2.93  2.76
 [25]  2.55  2.42  2.32  2.18  1.99  1.91  1.79  1.73  1.64  1.53  1.47  1.44
 [37]  1.42  1.40  1.34  1.26  1.25  1.16  1.14  1.10  1.09  1.04  1.03  0.98
 [49]  0.97  0.94  0.92  0.87  0.85  0.84  0.81  0.81  0.78  0.75  0.75  0.74
 [61]  0.73  0.69  0.67  0.66  0.65  0.63  0.62  0.60  0.59  0.57  0.55  0.55
 [73]  0.53  0.53  0.52  0.50  0.49  0.48  0.47  0.46  0.46  0.45  0.45  0.44
 [85]  0.43  0.42  0.41  0.40  0.40  0.39  0.38  0.37  0.37  0.36  0.35  0.35
 [97]  0.34  0.33  0.33  0.32  0.31  0.31  0.30  0.30  0.29  0.29  0.28  0.28
[109]  0.27  0.26  0.26  0.26  0.26  0.25  0.25  0.24  0.24  0.24  0.24  0.22
(Showing 120 of 993 eigenvalues)

Weights: Constant
Code
pcoa_relab$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa_relab$eig/sum(pcoa_relab$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa_relab$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa_relab$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 75.971 22.330
2 61.843 18.177
3 41.548 12.212
4 20.138 5.919
5 16.381 4.815
6 12.874 3.784
7 11.419 3.356
8 9.508 2.795
9 8.484 2.494
10 7.928 2.330
Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  dplyr::filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab_data <- 
  tibble(
    uid = rownames(pcoa_relab$points),
    PCoA = pcoa_relab$points
  ) |> 
  dplyr::left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(uid)
  )

Type of sample

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "combined_sample_type", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "combined_sample_type", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$sample_type)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$sample_type)
          Df SumOfSqs      R2      F Pr(>F)    
Model      3     3.52 0.01034 3.4484  0.001 ***
Residual 990   336.71 0.98966                  
Total    993   340.23 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Extraction Plate

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "ext_lib_plate_nb", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "ext_lib_plate_nb", c(3, 4))

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$ext_lib_plate_nb)
manova

Selected for re-extraction

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "selected_for_re_extraction", c(1, 2)) 

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "selected_for_re_extraction", c(3, 4)) 

Library Pool

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "library_pool", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "library_pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_relab_data |> dplyr::filter(sequencing_lane %in% 1:4), pcoa_relab, "sequencing_lane", c(1, 2))

Code
pcoa_plot(pcoa_relab_data |> dplyr::filter(sequencing_lane %in% 1:4), pcoa_relab, "sequencing_lane", c(3, 4))

Visit

Code
tmp <- 
  pcoa_relab_data |> 
  dplyr::filter(
    sample_type == "Clinical sample", 
    visit_code %in% c(seq(1000, 1900, by =100), 2120)
    )

v_colors <- c("red", "green3", colorRampPalette(c("dodgerblue1", "dodgerblue4","purple"))(7))
  

pcoa_plot(tmp, pcoa_relab, "visit_code", c(1, 2)) +  scale_color_manual(values = v_colors) 

Code
pcoa_plot(tmp, pcoa_relab, "visit_code", c(3,4)) +  scale_color_manual(values = v_colors) 

Saving SummarizedExperiment objects

We first check that the SE object is formatted as it should for its integration in the MAE.

Code
# we remove the columns pid, visit_code, and location
colData(SE_mg) <- colData(SE_mg)[, !colnames(colData(SE_mg)) %in% c("pid", "visit_code", "location")]
# we also remove additional columns that are not needed for downstream analyses
colData(SE_mg) <- colData(SE_mg)[, !colnames(colData(SE_mg)) %in% c("combined_sample_type", "sample_category")]

SE_mg <- check_se(SE_mg)

We save the SE objects to disk

Code
saveRDS(
  SE_mg, 
  str_c(
    get_01_output_dir(),  
    "02_se_mg_", today() |> str_remove_all("-"), ".rds"
    )
  )

We also save the technical_metadata_agg object to disk so we can re-use it for the qPCR data preparation.

Code
saveRDS(
  technical_metadata_agg, 
  str_c(
    get_01_output_dir(),  
    "02_technical_metadata_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )
Code
SE_mg |> 
  as_tibble() |> 
  select(.feature, .sample, mg_uid, rel_ab) |> 
  pivot_wider(
    names_from = .feature, 
    values_from = rel_ab
  ) |>
  write_csv(
    file = "MG_final_rel_ab_for_Michael_to_check.csv",
    quote = "none"
  )